# Code
import nltk
import os
import string
import matplotlib.pyplot as plt
# Fetch NLTK's 'punkt' tokenizer data at start-up; the script tokenizes text
# with nltk.word_tokenize further down.
# NOTE(review): this call runs on every execution of the script — confirm that
# is acceptable for offline use.
nltk.download('punkt') # Download NLTK tokenizer data
# Translation table that deletes every character in string.punctuation except
# the hyphen. It never changes, so it is built once here instead of on every
# call to remove_punctuation_except_hyphens().
_PUNCTUATION_EXCEPT_HYPHEN_TABLE = str.maketrans(
    '', '', string.punctuation.replace('-', '')
)


def remove_punctuation_except_hyphens(text: str) -> str:
    """Return *text* with ASCII punctuation removed, keeping hyphens.

    Only the characters listed in ``string.punctuation`` are affected;
    hyphens are preserved so that hyphenated words such as "re-turn" stay
    intact. All other characters (letters, digits, whitespace, non-ASCII
    symbols) pass through unchanged.

    Args:
        text: The string to clean.

    Returns:
        A new string with the punctuation characters deleted.
    """
    return text.translate(_PUNCTUATION_EXCEPT_HYPHEN_TABLE)
# Count the tokens of a text that begin with the hyphenated prefix "re-"
def count_re_words(text):
    """Return how many tokens of *text* start with "re-", ignoring case.

    The text is split into tokens with ``nltk.word_tokenize``; each token is
    lowercased before the prefix check, so "Re-turn" and "re-turn" both count.
    """
    matches = 0
    for token in nltk.word_tokenize(text):
        if token.lower().startswith("re-"):
            matches += 1
    return matches
# Map each poet's name to the directory that holds that poet's corpus files.
# The keys are carried through to the results and end up as the bar labels
# in the chart.
# NOTE(review): these are absolute paths under one user's home directory;
# adjust them before running the script on another machine.
corpus_directories = {
'swinburne': '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/swinburne/swinburne_noBP',
'hardy': '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/hardy/hardy_noBP',
'michael field': '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/field/field_NoBP',
'dg rossetti': '/home/adammazel/Documents/Digital_Scholarship/re-victorian-poetry/cta/rossetti_dg/rossetti_dg_NoBP',
# Add more poets here as 'name': 'directory' pairs
}
# Percentage of tokens that start with "re-" in each poet's corpus,
# keyed by poet name
percentage_re_words = {}

# Read, clean, and measure each poet's corpus
for poet, corpus_directory in corpus_directories.items():
    corpus = []
    # Load every file in the poet's directory with punctuation (except
    # hyphens) stripped
    for filename in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, filename)
        # os.listdir also returns sub-directories; open() would fail on
        # those, so only regular files are read
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        corpus.append(remove_punctuation_except_hyphens(text))

    # Count the "re-" words and the total number of tokens in the corpus
    re_word_count = sum(count_re_words(text) for text in corpus)
    total_words = sum(len(nltk.word_tokenize(text)) for text in corpus)

    # An empty corpus has no tokens; record 0% and say so instead of
    # stopping the whole run with a ZeroDivisionError
    if total_words == 0:
        print('Warning: no words found for', poet, 'in', corpus_directory)
        percentage_re_words[poet] = 0.0
    else:
        percentage_re_words[poet] = (re_word_count / total_words) * 100
# Order the (poet, percentage) pairs from the highest percentage to the lowest
sorted_results = sorted(percentage_re_words.items(), key=lambda x: x[1], reverse=True)
# Split the sorted pairs into two parallel tuples: poet names and percentages
poets, percentages = zip(*sorted_results)
# Draw one bar per poet, in the sorted order computed above
plt.figure(figsize=(8, 6))
# NOTE(review): only two colours are listed here although corpus_directories
# can hold more than two poets — confirm how the additional bars should be
# coloured.
plt.bar(poets, percentages, color=['blue', 'orange'])
plt.ylabel('Percentage (%)')
plt.title('Whose Poetry is More Composed of Words that Start with "Re-"?')
# Make the y-axis run from 0 to twice the largest percentage, so the tallest
# bar reaches half the plot height
ylim_percentage = max(percentages) * 2 # headroom above the tallest bar
plt.ylim(0, ylim_percentage)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Rotate the poet labels, tighten the layout, and show the chart
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()